###########################################
# Read the 2020 and 2021 NC death-record extracts and stack them into a single
# table.
d2020 <- c(
  "raw_data/nc_data/Kevin Morris - 2019-2021 Individual Death Records/2020 NC Occurrence Individual Deaths.csv",
  "raw_data/nc_data/Kevin Morris - 2019-2021 Individual Death Records/2021 NC Occurrence Individual Deaths.csv"
) |>
  lapply(fread) |>
  bind_rows()

# The values in the third row are used (lower-cased) as the column names, and
# only the rows from the fourth onward are kept as data.
names(d2020) <- tolower(unlist(d2020[3, ]))
d2020 <- d2020[seq(4, nrow(d2020)), ]

# Snapshot of the table as loaded, taken before any cleaning is applied.
hold <- d2020

# Build the street string from the address component columns, derive the
# death/birth dates, the COVID flag and the age at death, then keep (and
# rename) only the columns used downstream.
# NOTE(review): clean_streets() is defined outside this chunk; the select()
# below relies on a `street` column being present after it runs -- confirm.
d2020 <- d2020 |>
  clean_streets(c("addrnum", "addrpred", "addrname",
                  "addrsuff", "addrpost", "addrapt")) |>
  mutate(
    death_date = make_date(year = dod_yr, month = dod_mo, day = dod_dy),
    birth_date = make_date(year = dob_yr, month = dob_mo, day = dob_dy)
  ) |>
  mutate(
    # TRUE when the first cause-of-death code is exactly "U071".
    covid = cod1 == "U071",
    # Whole years elapsed between birth and death.
    decd_age_in_years = floor(interval(birth_date, death_date) / years(1))
  ) |>
  select(
    residence_street = street,
    residence_city = citytext_r,
    decd_age_in_years,
    decd_first_name = fname,
    decd_last_name = lname,
    birth_date,
    covid,
    death_date,
    residence_state = stres
  )

# Normalise the street string and drop records without a usable NC street
# address.
d2020 <- d2020 |>
  # Collapse every run of punctuation and/or spaces to one space; upper-case.
  mutate(
    residence_street = toupper(gsub("[[:punct:] ]+", " ", residence_street))
  ) |>
  filter(
    # Drop the row if the street contains any of these markers.
    !Reduce(`|`, lapply(c("UNKNOWN",
                          "P O BOX",
                          "PO BOX",
                          "POBOX",
                          "HOMELESS",
                          "TRANSIENT",
                          "NO PERMANENT ADDRESS"),
                        grepl,
                        x = residence_street)),
    # Drop blank streets and anything not recorded as an NC residence.
    trimws(residence_street) != "",
    residence_state == "NC"
  ) |>
  mutate(
    # Squeeze whitespace runs and strip leading/trailing blanks ...
    residence_street = trimws(gsub("\\s+", " ", residence_street)),
    # ... then remove stand-alone apartment designator words.
    residence_street = gsub(" APT ", " ", residence_street),
    residence_street = gsub(" APARTMENT ", " ", residence_street)
  )

# Load the search/replace pairs used to standardise street strings. Pairs whose
# search and replace text are identical are dropped, both columns are
# upper-cased, and exact duplicate pairs are removed.
ad_cl <- fread("raw_data/address_cleaner.csv") |>
  filter(search != replace) |>
  mutate(across(c(search, replace), toupper)) |>
  distinct()

# Apply the pairs one after another, in table order. `search` only matches
# when it is delimited by whitespace or the start/end of the string; the
# captured delimiters (\\1, \\2) are written back around the replacement.
# Note that `search` is pasted into the pattern unescaped, so any regex
# metacharacters in the CSV are interpreted as regex.
#
# seq_len() makes the loop a no-op when ad_cl has no rows; 1:nrow(ad_cl) would
# iterate over c(1, 0) in that case.
for (i in seq_len(nrow(ad_cl))) {
  print(i)  # progress indicator
  d2020$residence_street <- gsub(paste0("(^|\\s)", ad_cl$search[i], "(\\s|$)"),
                                 paste0("\\1", ad_cl$replace[i], "\\2"),
                                 d2020$residence_street)
}

# Standardise decedent names and city names.
d2020 <- d2020 |>
  mutate(
    # Names: re-encode from WINDOWS-1252 to UTF-8, lower-case, turn empty
    # strings into NA, then strip all punctuation and spaces.
    across(
      c("decd_first_name", "decd_last_name"),
      \(nm) {
        gsub("[[:punct:]]| ", "",
             ifelse(nm == "", NA, tolower(iconv(nm, "WINDOWS-1252", "UTF-8"))))
      }
    ),
    # Cities: remove punctuation and upper-case.
    residence_city = toupper(gsub("[[:punct:]]", "", residence_city))
  ) |>
  mutate(
    # Stripping punctuation fuses hyphenated city names; restore the space
    # for these two.
    residence_city = case_when(
      residence_city == "WINSTONSALEM" ~ "WINSTON SALEM",
      residence_city == "FUQUAYVARINA" ~ "FUQUAY VARINA",
      TRUE ~ residence_city
    )
  )

# Expand the city abbreviations "MT" -> "MOUNT" and "ST" -> "SAINT".
#
# The abbreviation has to be a whole token (start of string or preceded by
# whitespace) so that names that merely end in those letters are left alone:
# an unanchored "ST " pattern turns "WEST JEFFERSON" into "WESAINTJEFFERSON"
# and "FOREST CITY" into "FORESAINTCITY". The separating space is also written
# back, so "MT AIRY" becomes "MOUNT AIRY" rather than "MOUNTAIRY".
# NOTE(review): if the voter file read from temp/cleaned_file_nc.rds was built
# with the unanchored replacement, apply the same fix there so that city names
# on both sides still agree.
d2020 <- mutate(d2020,
                residence_city = gsub("(^|\\s)MT ", "\\1MOUNT ", residence_city),
                residence_city = gsub("(^|\\s)ST ", "\\1SAINT ", residence_city))



# Abbreviate compass directions in the street string. These are plain
# substring replacements (they also fire inside longer words); the voter-file
# streets further down get exactly the same treatment.
d2020 <- d2020 |>
  mutate(
    residence_street = gsub("NORTH", "N", residence_street),
    residence_street = gsub("SOUTH", "S", residence_street),
    residence_street = gsub("EAST", "E", residence_street),
    residence_street = gsub("WEST", "W", residence_street)
  ) |>
  # Split at the first run of whitespace: `num` is the text before it, `rest`
  # is everything after it. The original column is kept.
  extract(residence_street,
          into = c("num", "rest"),
          regex = "(.*?)\\s+(.*)",
          remove = FALSE)

# Checkpoint: write the cleaned death records to disk and read them back.
saveRDS(d2020, "temp/cleaned_death_nc.rds")

d2020 <- readRDS("temp/cleaned_death_nc.rds")

#####################################

# Voter file: apply the same city/street normalisation as the death records,
# remove voters who appear in the death records, and build the num/rest keys.
k <- readRDS("temp/cleaned_file_nc.rds") |>
  mutate(
    # City: strip punctuation, upper-case.
    Residence_Addresses_City =
      toupper(gsub("[[:punct:]]", "", Residence_Addresses_City)),
    # Street: abbreviate directions, then drop apartment designator words.
    street = gsub("NORTH", "N", street),
    street = gsub("SOUTH", "S", street),
    street = gsub("EAST", "E", street),
    street = gsub("WEST", "W", street),
    street = gsub(" APARTMENT ", " ", street),
    street = gsub(" APT ", " ", street)
  ) |>
  mutate(Voters_BirthDate = as.Date(Voters_BirthDate, "%m/%d/%Y")) |>
  # Drop any voter whose birth date + street + city equals that of a death
  # record.
  filter(
    !(paste0(Voters_BirthDate, street, Residence_Addresses_City) %in%
        paste0(d2020$birth_date, d2020$residence_street, d2020$residence_city))
  ) |>
  # Split the street at the first run of whitespace into `num` and `rest`.
  extract(street,
          into = c("num", "rest"),
          regex = "(.*?)\\s+(.*)",
          remove = FALSE) |>
  # Append the city to `rest` so the same street text in two cities differs.
  mutate(rest = paste(rest, Residence_Addresses_City))

# Keep deaths on or before 2020-12-31 or on or after 2021-05-03, and append
# the city to `rest` so it is built the same way as k$rest.
d2020 <- d2020 |>
  mutate(death_date = as.Date(death_date, "%m/%d/%Y")) |>
  mutate(rest = paste(rest, residence_city)) |>
  filter(death_date <= "2020-12-31" | death_date >= "2021-05-03")

# Death records whose street-remainder + city appears verbatim in the voter
# file: distance 0, matched string is the string itself.
good <- d2020 |>
  filter(rest %in% k$rest) |>
  mutate(dist = 0,
         new = rest)

# Everything else goes through approximate matching.
bad <- d2020 |>
  filter(!(rest %in% k$rest))

# One row per distinct unmatched address. `best` is the index into k$rest of
# the closest entry within Levenshtein distance 3 (NA if there is none), `new`
# is that entry, and `dist` is the Levenshtein distance between the two.
bad_ads <- bad |>
  group_by(rest) |>
  summarize(n = n()) |>
  mutate(best = amatch(rest, k$rest, maxDist = 3, method = "lv"),
         new = k$rest[best],
         dist = stringdist(rest, new, method = "lv"))

# Attach the match results to the individual records; the per-address count is
# not needed afterwards.
bad <- bad |>
  left_join(bad_ads) |>
  select(-n)

full <- bind_rows(good, bad)

saveRDS(full, "temp/nc_death_with_dists.rds")
#######################################################
#######################################################
#######################################################

# Exact matches only (dist == 0): collapse the death records to one row per
# street / city / period. `pre` is TRUE for deaths before 2021-05-03; deaths
# between 2021-01-01 and 2021-05-02 are excluded altogether.
one_per <- readRDS("temp/nc_death_with_dists.rds") |>
  mutate(death_date = as.Date(death_date, "%m/%d/%Y")) |>
  mutate(pre = death_date < "2021-05-03") |>
  filter(
    (death_date <= "2020-12-31" | death_date >= "2021-05-03") & dist == 0
  ) |>
  group_by(residence_street, residence_city, pre) |>
  summarize(
    death_date = mean(death_date),      # mean death date at the address
    age = mean(decd_age_in_years),      # mean age at death
    covid = max(covid),                 # max of the per-record covid flag
    n_dead = n(),                       # number of deaths at the address
    last_name = min(decd_last_name)     # smallest last name in sort order
  )

# Addresses with a death in the earlier period ...
tre <- one_per |>
  filter(pre)

# ... and addresses with a death in the later period.
pra <- one_per |>
  filter(!pre)

############################################################


# Attach the earlier-period death summaries to voters by street and city, then
# code each voter:
#   treated = 0  no matching summary row (covid is NA)
#   treated = 1  matching row with covid equal to 0/FALSE
#   treated = 2  matching row with covid equal to 1/TRUE
k_m <- k |>
  left_join(tre,
            by = c("street" = "residence_street",
                   "Residence_Addresses_City" = "residence_city")) |>
  mutate(n_dead = ifelse(is.na(n_dead), 0, n_dead),
         treated = ifelse(is.na(covid), 0,
                          ifelse(covid, 2, 1)))

# Voters at an address with an earlier-period death.
k_m |>
  filter(treated > 0) |>
  saveRDS("temp/vf_nc.rds")

# Voters without an earlier-period death at their address: discard the empty
# summary columns and the key helpers, then keep only those whose address has
# a later-period death.
k_m2 <- k_m |>
  filter(treated == 0) |>
  select(-c(death_date, age, covid, n_dead,
            last_name, treated, num, rest, pre)) |>
  inner_join(pra,
             by = c("street" = "residence_street",
                    "Residence_Addresses_City" = "residence_city"))

saveRDS(k_m2, "temp/vf_later_nc.rds")

# NOTE(review): cleanup() is defined outside this chunk; presumably it clears
# the workspace while keeping `k`, which is used again below -- confirm.
cleanup("k")
#################################################
#################################################
#################################################


# Same per-address summary as above, but also accepting approximate matches:
# records with a non-missing edit distance below 3 are kept, and `rest` is
# replaced by the matched voter-file string (`new`) before grouping on the
# num / rest key.
one_per <- readRDS("temp/nc_death_with_dists.rds") |>
  mutate(death_date = as.Date(death_date, "%m/%d/%Y")) |>
  mutate(pre = death_date < "2021-05-03") |>
  filter(
    (death_date <= "2020-12-31" | death_date >= "2021-05-03") &
      dist < 3 &
      !is.na(dist)
  ) |>
  mutate(rest = new) |>
  group_by(num, rest, pre) |>
  summarize(
    death_date = mean(death_date),      # mean death date at the address
    age = mean(decd_age_in_years),      # mean age at death
    covid = max(covid),                 # max of the per-record covid flag
    n_dead = n(),                       # number of deaths at the address
    last_name = min(decd_last_name)     # smallest last name in sort order
  )

# Addresses with a death in the earlier period ...
tre <- one_per |>
  filter(pre)

# ... and addresses with a death in the later period.
pra <- one_per |>
  filter(!pre)

#####################################


# Attach the earlier-period death summaries to voters on the num / rest key,
# then code each voter:
#   treated = 0  no matching summary row (covid is NA)
#   treated = 1  matching row with covid equal to 0/FALSE
#   treated = 2  matching row with covid equal to 1/TRUE
k_m <- k |>
  left_join(tre, by = c("num", "rest")) |>
  mutate(n_dead = ifelse(is.na(n_dead), 0, n_dead),
         treated = ifelse(is.na(covid), 0,
                          ifelse(covid, 2, 1)))

# Voters at an address with an earlier-period death.
k_m |>
  filter(treated > 0) |>
  saveRDS("temp/vf_nc_dist.rds")

# Voters without an earlier-period death at their address: discard the empty
# summary columns (num and rest stay, they are the join key), then keep only
# those whose address has a later-period death.
k_m2 <- k_m |>
  filter(treated == 0) |>
  select(-c(death_date, age, covid, n_dead,
            last_name, treated, pre)) |>
  inner_join(pra, by = c("num", "rest"))

saveRDS(k_m2, "temp/vf_later_nc_dist.rds")

